packages <- c("CIMseq", "CIMseq.data", "tidyverse", "scSeqTools", "printr")
purrr::walk(packages, library, character.only = TRUE)
rm(packages)

##DATA
load('../data/CIMseqData.rda')
load('../data/sObj.rda')
#rename classes
renameClasses <- function(class) {
  case_when(
    class == "0" ~ "C.Stem.proximal",
    class == "1" ~ "C.Stem.distal",
    class == "2" ~ "SI.Stem",
    class == "3" ~ "C.Goblet.distal",
    class == "4" ~ "SI.Enterocytes",
    class == "5" ~ "SI.Goblet",
    class == "6" ~ "C.Goblet.proximal",
    class == "7" ~ "Enteroendocrine",
    class == "8" ~ "Tufft",
    class == "9" ~ "SI.Paneth",
    class == "10" ~ "Blood",
    TRUE ~ "error"
  )
}

cOrder <- c(
  "C.Stem.proximal", "C.Goblet.proximal", "C.Stem.distal", "C.Goblet.distal",
  "SI.Goblet", "SI.Paneth", "SI.Stem", "SI.Enterocytes",
  "Enteroendocrine", "Tufft", "Blood"
)

getData(cObjSng, "classification") <- renameClasses(getData(cObjSng, "classification"))
features <- getData(cObjMul, "features")
names(features) <- renameClasses(names(features))
getData(cObjMul, "features") <- features
fractions <- getData(sObj, "fractions")
colnames(fractions) <- renameClasses(colnames(fractions))
sObj@fractions <- fractions

Classification

plotUnsupervisedClass(cObjSng, cObjMul)

plotUnsupervisedMarkers(
  cObjSng, cObjMul,
  c("Lgr5", "Ptprc", "Chga", "Dclk1", "Alpi", "Slc26a3", "Atoh1", "Lyz1"),
  pal = RColorBrewer::brewer.pal(8, "Set1")
)

plotUnsupervisedMarkers(
  cObjSng, cObjMul, "Hoxb13",
  pal = RColorBrewer::brewer.pal(8, "Set1")
)

plotUnsupervisedMarkers(
  cObjSng, cObjMul, "Mki67",
  pal = RColorBrewer::brewer.pal(8, "Set1")
)

classHeatmap(
  data = data.frame(
    gene = rownames(getData(cObjMul, "counts"))[getData(cObjMul, "features")],
    class = names(getData(cObjMul, "features"))
  ),
  counts.log = getData(cObjSng, "counts.log"),
  classes = tibble(
    sample = colnames(getData(cObjSng, "counts")), 
    class = getData(cObjSng, "classification")
  )
)

Show top 10 (by fold change) features per class.

violinPlot <- function(cpm, genes, classes, class) {
  cpm[genes, ] %>%
    t() %>% 
    matrix_to_tibble("sample") %>%
    gather(gene, cpm, -sample) %>%
    full_join(tibble(sample = colnames(cpm), class = classes), by = "sample") %>%
    ggplot() +
    geom_jitter(aes(class, cpm), height = 0, size = 0.1, alpha = 0.25) +
    facet_wrap(~gene, scales = "free_y") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90)) +
    labs(title = paste0(class, " genes"))
}


getGenes <- function(
  cpm, classes, features, class, ntop = 10
) {
  fc <- foldChangePerClass(
    cpm[features[names(features) == class], ], 
    tibble(sample = colnames(cpm), class = classes)
  )
  rownames(fc[order(fc[, class], decreasing = TRUE), ])[1:ntop]
}

cpm <- getData(cObjSng, "counts.cpm")
c <- "C.Stem.distal"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)

c <- "C.Stem.proximal"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)

c <- "C.Goblet.distal"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)

c <- "C.Goblet.proximal"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)

c <- "SI.Stem"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)

c <- "SI.Goblet"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)

c <- "SI.Enterocytes"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)

c <- "SI.Paneth"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)

Deconvolution

plotSwarmCircos(sObj, cObjSng, cObjMul, weightCut = 0, classOrder = cOrder)

Filtering

Only detected duplicates and triplicates.
Only ERCC estimated cell number <= 4.
Weight cutoff = 5.

adj <- adjustFractions(cObjSng, cObjMul, sObj, binary = TRUE)
samples <- rownames(adj)
rs <- rowSums(adj)
keep <- rs == 2 | rs == 3

plotSwarmCircos(
  filterSwarm(sObj, keep), cObjSng, cObjMul, weightCut = 5, 
  classOrder = cOrder, theoretical.max = 4
)

False positive stats

efm <- getEdgesForMultiplet(sObj, cObjSng, cObjMul, theoretical.max = 4) %>% 
  mutate(conn = map2(from, to, ~tibble(
    from = sort(c(.x, .y))[1], to = sort(c(.x, .y))[2]
  ))) %>% 
  select(-from, -to) %>% 
  unnest() %>% 
  distinct()

edgeStats <- calculateEdgeStats(sObj, cObjSng, cObjMul, theoretical.max = 4) %>%
  mutate(conn = map2(from, to, ~tibble(
    from = sort(c(.x, .y))[1], to = sort(c(.x, .y))[2]
  ))) %>% 
  select(-from, -to) %>% 
  unnest() %>% 
  select(from, to, everything()) %>%
  distinct() %>%
  as_tibble()

fp <- efm %>%
  mutate(fp = case_when(
    str_detect(from, "^SI") & str_detect(to, "^C") ~ TRUE,
    str_detect(to, "^SI") & str_detect(from, "^C") ~ TRUE,
    str_detect(from, "proximal") & str_detect(to, "distal") ~ TRUE,
    str_detect(to, "proximal") & str_detect(from, "distal") ~ TRUE,
    TRUE ~ FALSE
  )) %>%
  group_by(sample) %>%
  summarize(fp = sum(fp)) %>%
  {setNames(pull(., fp), pull(., sample))}

fp.c <- efm %>%
  inner_join(select(MGA.Meta, sample, sub_tissue), by = "sample") %>%
  mutate(fp = case_when(
    (str_detect(from, "^C") | str_detect(to, "^C")) & sub_tissue == "small_intestine" ~ TRUE,
    (str_detect(from, "^SI") | str_detect(to, "^SI")) & sub_tissue == "colon" ~ TRUE,
    TRUE ~ FALSE
  )) %>%
  group_by(sample) %>%
  summarize(fp = sum(fp)) %>%
  {setNames(pull(., fp), pull(., sample))}

If we define a false positive as a connection between either, the small intestine and the colon or a connection between the proximal and distal colon based on the classifications, the number of detected false positives is as follows:
(Definition 1) Detected 1304 false positive connections out of 2230 total connections. Of those multiplets with a detected connection, 782 multiplets have at least one false positive out of 1216 total multiplets. The per multiplet false positive distribution is shown in the plot below.

ggplot() +
  geom_bar(aes(fp)) +
  theme_bw() +
  labs(x = "False positives per multiplet") +
  scale_x_continuous(breaks = 0:max(fp))

If we instead define the false positives as multiplets that were sorted as small intestine that contain connections with colon classes and vice versa, the number of false positives is as follows:
(Definition 2) Detected 547 false positive connections out of 2230 total connections. Of those multiplets with a detected connection, 322 multiplets have at least one false positive out of 1216 total multiplets.

We can also evaluate false positives via definition 1 and 2 and also include whether or not the connection is significant at alpha 0.05.

(Definition 1)

edgeStats %>% 
  mutate(significant = pval < 0.05) %>%
  mutate(`false positive` = case_when(
    str_detect(from, "^SI") & str_detect(to, "^C") ~ TRUE,
    str_detect(to, "^SI") & str_detect(from, "^C") ~ TRUE,
    str_detect(from, "proximal") & str_detect(to, "distal") ~ TRUE,
    str_detect(to, "proximal") & str_detect(from, "distal") ~ TRUE,
    TRUE ~ FALSE
  )) %>% 
  group_by(significant, `false positive`) %>% 
  summarize(n = sum(weight)) %>% 
  ungroup() %>% 
  as.data.frame()
significant false positive n
FALSE FALSE 161
FALSE TRUE 1880
TRUE FALSE 1691
TRUE TRUE 728

(Definition 2)

efm %>%
  inner_join(edgeStats, by = c("from", "to")) %>% 
  inner_join(select(MGA.Meta, sample, sub_tissue), by = "sample") %>%
  mutate(significant = pval < 0.05) %>%
  mutate(`false positive` = case_when(
    (str_detect(from, "^C") | str_detect(to, "^C")) & sub_tissue == "small_intestine" ~ TRUE,
    (str_detect(from, "^SI") | str_detect(to, "^SI")) & sub_tissue == "colon" ~ TRUE,
    TRUE ~ FALSE
  )) %>% 
  count(significant, `false positive`)
significant false positive n
FALSE FALSE 1404
FALSE TRUE 637
TRUE FALSE 1962
TRUE TRUE 457

The type of false positive connections and their number are shown below.

(Definition 1)

efm %>%
  mutate(fp = case_when(
    str_detect(from, "^SI") & str_detect(to, "^C") ~ TRUE,
    str_detect(to, "^SI") & str_detect(from, "^C") ~ TRUE,
    str_detect(from, "proximal") & str_detect(to, "distal") ~ TRUE,
    str_detect(to, "proximal") & str_detect(from, "distal") ~ TRUE,
    TRUE ~ FALSE
  )) %>%
  filter(fp) %>%
  count(from, to) %>%
  arrange(desc(n)) %>%
  as.data.frame()
from to n
C.Stem.distal C.Stem.proximal 321
C.Goblet.distal C.Stem.proximal 234
C.Stem.proximal SI.Stem 154
C.Goblet.distal C.Goblet.proximal 140
C.Stem.proximal SI.Goblet 100
C.Goblet.proximal C.Stem.distal 77
C.Stem.proximal SI.Enterocytes 55
C.Goblet.distal SI.Goblet 51
C.Stem.distal SI.Goblet 42
C.Goblet.proximal SI.Goblet 40
C.Stem.distal SI.Enterocytes 36
C.Stem.distal SI.Stem 26
C.Goblet.distal SI.Enterocytes 7
C.Stem.proximal SI.Paneth 7
C.Goblet.distal SI.Stem 6
C.Goblet.proximal SI.Stem 4
C.Goblet.proximal SI.Enterocytes 2
C.Goblet.distal SI.Paneth 1
C.Stem.distal SI.Paneth 1

(Definition 2)

efm %>%
  inner_join(select(MGA.Meta, sample, sub_tissue)) %>%
  mutate(fp = case_when(
    (str_detect(from, "^C") | str_detect(to, "^C")) & sub_tissue == "small_intestine" ~ TRUE,
    (str_detect(from, "^SI") | str_detect(to, "^SI")) & sub_tissue == "colon" ~ TRUE,
    TRUE ~ FALSE
  )) %>%
  filter(fp) %>%
  count(from, to) %>%
  arrange(desc(n)) %>%
  as.data.frame()
## Joining, by = "sample"
from to n
C.Stem.proximal SI.Stem 154
C.Stem.proximal SI.Goblet 100
C.Stem.proximal SI.Enterocytes 55
C.Goblet.distal SI.Goblet 51
C.Stem.distal SI.Goblet 42
C.Goblet.proximal SI.Goblet 40
C.Stem.distal SI.Enterocytes 36
C.Stem.distal SI.Stem 26
C.Goblet.distal SI.Enterocytes 7
C.Stem.proximal SI.Paneth 7
C.Stem.proximal Tufft 7
C.Goblet.distal SI.Stem 6
C.Stem.proximal Enteroendocrine 5
C.Goblet.proximal SI.Stem 4
C.Goblet.proximal SI.Enterocytes 2
Blood C.Stem.proximal 1
C.Goblet.distal SI.Paneth 1
C.Stem.distal SI.Paneth 1
Enteroendocrine SI.Enterocytes 1
SI.Goblet SI.Stem 1

Number of fp divided by number of detected connections.

dc <- rowSums(adjustFractions(cObjSng, cObjMul, sObj))
tibble(
  sample = names(fp),
  `False positives (n)` = fp,
  `Deconvolution detected connections (n)` = dc[names(fp)]
) %>% 
  group_by(`Deconvolution detected connections (n)`) %>%
  summarize(`False positives (n)` = sum(`False positives (n)`)) %>%
  as.data.frame()
Deconvolution detected connections (n) False positives (n)
2 408
3 575
4 283
5 30
6 8

Correlation between various parameters and the number of false positives is shown below:

mulNames <- names(fp) #note other multiplets have 0 connections

ec <- estimateCells(cObjSng, cObjMul, warn = FALSE) %>%
  filter(sample %in% mulNames) %>%
  {setNames(pull(., estimatedCellNumber), pull(., sample))}
ec <- ec[mulNames]
dec <- rowSums(adjustFractions(cObjSng, cObjMul, sObj, binary = TRUE))
dec <- dec[mulNames]
c <- getData(sObj, "costs")
c <- c[mulNames]
tc <- colSums(getData(cObjMul, "counts"))
tc <- tc[mulNames]
dg <- apply(getData(cObjMul, "counts"), 2, function(c) length(which(c != 0)))
dg <- dg[mulNames]
feat.do <- getData(cObjMul, "counts.cpm")[features, ] %>%
  matrix_to_tibble("gene") %>%
  gather(sample, cpm, -gene) %>%
  group_by(sample) %>%
  summarize(do = length(which(cpm == 0))) %>%
  {setNames(pull(., do), pull(., sample))}
feat.do <- feat.do[mulNames]

cormat <- cor(matrix(c(fp, ec, dec, c, tc, dg, feat.do), ncol = 7))
corvec <- cormat[2:nrow(cormat), 1]
names(corvec) <- c(
  "ERCC estimated cell number", "Deconvolution estimated cell number", 
  "Deconvolution cost", "Total counts", "Total detected genes", 
  "Drop-outs in features"
)
data.frame(cor = corvec)
cor
ERCC estimated cell number 0.1882968
Deconvolution estimated cell number 0.7280198
Deconvolution cost 0.3296029
Total counts 0.3524432
Total detected genes 0.4834431
Drop-outs in features -0.4391924
cpm <- getData(cObjMul, "counts.cpm")
features <- getData(cObjMul, "features")
cpm[features, ] %>%
  matrix_to_tibble("gene") %>%
  gather(sample, cpm, -gene) %>%
  inner_join(tibble(class = names(features), gene = rownames(cpm)[features])) %>%
  group_by(sample, class) %>%
  summarize(cpm.sum = sum(cpm)) %>%
  ungroup() %>%
  inner_join(tibble(sample = names(fp), fp = fp)) %>%
  ggplot() +
  geom_point(aes(cpm.sum, fp)) +
  facet_wrap(~class)
#multiplets with high fp look to have low expression of the selected features

Distribution of fractions in connections thought to be true vs connections known to be false.

fractions <- getData(sObj, "fractions")
efm %>%
  mutate(fp = case_when(
    str_detect(from, "^SI") & str_detect(to, "^C") ~ TRUE,
    str_detect(to, "^SI") & str_detect(from, "^C") ~ TRUE,
    str_detect(from, "proximal") & str_detect(to, "distal") ~ TRUE,
    str_detect(to, "proximal") & str_detect(from, "distal") ~ TRUE,
    TRUE ~ FALSE
  )) %>%
  mutate(fracs = pmap(list(sample, from, to), function(s, f, t) {
    fractions[s, c(f, t)]
  })) %>%
  unnest() %>%
  ggplot() +
  geom_boxplot(aes(fp, fracs)) +
  labs(x = "False positive", y = "Deconvolution fraction") +
  theme_bw()

Cluster multiplets and look for an overrepresentaion of false positives.

p.dist <- CIMseq::pearsonsDist(getData(cObjMul, "counts.cpm"), CIMseq::selectTopMax(getData(cObjMul, "counts.cpm"), 2000))
set.seed(234899)
u <- uwot::umap(p.dist)
rownames(u) <- colnames(getData(cObjMul, "counts"))
u %>% 
  matrix_to_tibble("sample") %>% 
  inner_join(tibble(sample = names(fp), fp = fp)) %>% 
  mutate(fp.bool = fp > 0) %>% 
  ggplot() + 
  geom_point(aes(V1, V2, colour = fp.bool)) +
  theme_bw() +
  labs(x = "UMAP1", y = "UMAP2") +
  guides(colour = guide_legend(title = "Has false positive?"))
## Joining, by = "sample"

u %>% 
  matrix_to_tibble("sample") %>% 
  inner_join(tibble(sample = names(fp), fp = fp)) %>% 
  ggplot() + 
  geom_point(aes(V1, V2, colour = fp)) +
  theme_bw() +
  scale_colour_viridis_c() +
  labs(x = "UMAP1", y = "UMAP2") +
  guides(colour = guide_legend(title = "False positives (n)"))
## Joining, by = "sample"